Rutuja Lohakare - Customer Data Analysis and Visualization¶

In [1]:
#importing the numpy library
import numpy as np

# for dataframe manipulations
import pandas as pd

# for Data Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

# for data analysis
import dabl
In [72]:
# importing the dataset
# FIX: use a raw string so '\c' in the Windows path is not treated as an
# (invalid) escape sequence — a SyntaxWarning on Python >= 3.12. The runtime
# path value is unchanged.
# NOTE(review): the absolute 'R:\' drive path ties this notebook to one
# machine — consider a configurable relative path.
data = pd.read_csv(r'R:\customer-data-marketing-campaign.csv')
In [73]:
# report the dataset dimensions as (rows, columns)
n_rows, n_cols = data.shape
print('Shape of the dataset: ', (n_rows, n_cols))
Shape of the dataset:  (2240, 29)
In [34]:
# preview the first five records of the dataset
data.head(5)
Out[34]:
ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer Recency MntWines ... NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response
0 5524 1957 Graduation Single 58138.0 0 0 04-09-2012 58 635 ... 7 0 0 0 0 0 0 3 11 1
1 2174 1954 Graduation Single 46344.0 1 1 08-03-2014 38 11 ... 5 0 0 0 0 0 0 3 11 0
2 4141 1965 Graduation Together 71613.0 0 0 21-08-2013 26 426 ... 4 0 0 0 0 0 0 3 11 0
3 6182 1984 Graduation Together 26646.0 1 0 10-02-2014 26 11 ... 6 0 0 0 0 0 0 3 11 0
4 5324 1981 PhD Married 58293.0 1 0 19-01-2014 94 173 ... 5 0 0 0 0 0 0 3 11 0

5 rows × 29 columns

Data Visualization¶

In [35]:
# pairwise scatter matrix across the numeric columns of the dataset
pair_grid = sns.pairplot(data)
plt.show()
In [97]:
# Correlation heatmap for the data
# FIX: numeric_only=True is required on pandas >= 2.0 — the frame contains
# object columns (Education, Marital_Status, Dt_Customer per describe(include='object'))
# that .corr() no longer silently drops.
sns.heatmap(data.corr(numeric_only = True), annot = True, cmap = 'copper')
plt.title('Correlation Heatmap for Customer Data', fontsize = 16)
plt.show()
In [39]:
# analyzing the data with respect to Income
# dabl inspects the frame and chooses plot types automatically; with a
# continuous target column it reports "Target looks like regression" and
# ranks features by association with Income (see the F-scores in the output).
dabl.plot(data, target_col = 'Income')
Target looks like regression
Out[39]:
[<AxesSubplot: title={'center': 'Target distribution'}, xlabel='Income', ylabel='frequency'>,
 array([[<AxesSubplot: title={'center': 'F=8.43E-01'}, xlabel='MntWines', ylabel='Income'>,
         <AxesSubplot: title={'center': 'F=8.25E-01'}, xlabel='MntMeatProducts'>,
         <AxesSubplot: title={'center': 'F=7.99E-01'}, xlabel='NumCatalogPurchases (jittered)'>,
         <AxesSubplot: title={'center': 'F=7.50E-01'}, xlabel='NumStorePurchases (jittered)'>,
         <AxesSubplot: title={'center': 'F=-6.41E-01'}, xlabel='NumWebVisitsMonth (jittered)'>],
        [<AxesSubplot: title={'center': 'F=5.91E-01'}, xlabel='MntFruits', ylabel='Income'>,
         <AxesSubplot: title={'center': 'F=5.88E-01'}, xlabel='NumWebPurchases (jittered)'>,
         <AxesSubplot: title={'center': 'F=5.87E-01'}, xlabel='MntFishProducts'>,
         <AxesSubplot: title={'center': 'F=5.78E-01'}, xlabel='MntSweetProducts'>,
         <AxesSubplot: title={'center': 'F=5.19E-01'}, xlabel='MntGoldProds'>],
        [<AxesSubplot: title={'center': 'F=-2.21E-01'}, xlabel='Year_Birth', ylabel='Income'>,
         <AxesSubplot: title={'center': 'F=-1.95E-01'}, xlabel='NumDealsPurchases (jittered)'>,
         <AxesSubplot: title={'center': 'F=8.11E-03'}, xlabel='Recency'>,
         <AxesSubplot: title={'center': 'F=3.48E-03'}, xlabel='ID'>,
         <AxesSubplot: >]], dtype=object),
 array([[<AxesSubplot: title={'center': 'F=2.18E-01'}, xlabel='Income', ylabel='Education'>,
         <AxesSubplot: title={'center': 'F=1.77E-01'}, xlabel='Income', ylabel='Marital_Status'>,
         <AxesSubplot: title={'center': 'F=1.29E-01'}, xlabel='Income', ylabel='Kidhome'>],
        [<AxesSubplot: title={'center': 'F=1.21E-01'}, xlabel='Income', ylabel='Teenhome'>,
         <AxesSubplot: title={'center': 'F=8.29E-02'}, xlabel='Income', ylabel='AcceptedCmp3'>,
         <AxesSubplot: title={'center': 'F=7.32E-02'}, xlabel='Income', ylabel='AcceptedCmp4'>],
        [<AxesSubplot: title={'center': 'F=3.80E-02'}, xlabel='Income', ylabel='AcceptedCmp5'>,
         <AxesSubplot: title={'center': 'F=3.65E-02'}, xlabel='Income', ylabel='AcceptedCmp1'>,
         <AxesSubplot: title={'center': 'F=1.82E-02'}, xlabel='Income', ylabel='Response'>]],
       dtype=object)]
In [40]:
# analyzing the data with respect to Education
# with a categorical target dabl treats this as classification and fits a
# quick Linear Discriminant Analysis baseline (score reported in the output).
dabl.plot(data, target_col = 'Education')
Target looks like classification
Linear Discriminant Analysis training set score: 0.246
Out[40]:
[[<Figure size 2000x1500 with 15 Axes>,
  <Figure size 1600x400 with 4 Axes>,
  <Figure size 1600x400 with 4 Axes>,
  <Figure size 1600x400 with 4 Axes>],
 None]

DESCRIPTIVE STATISTICS¶

In [41]:
# summary statistics (count / mean / std / quantiles) for the numeric columns;
# on a mixed-dtype frame describe() restricts itself to numeric fields
data.describe(include = 'number')
Out[41]:
ID Year_Birth Income Kidhome Teenhome Recency MntWines MntFruits MntMeatProducts MntFishProducts ... NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response
count 2240.000000 2240.000000 2216.000000 2240.000000 2240.000000 2240.000000 2240.000000 2240.000000 2240.000000 2240.000000 ... 2240.000000 2240.000000 2240.000000 2240.000000 2240.000000 2240.000000 2240.000000 2240.0 2240.0 2240.000000
mean 5592.159821 1968.805804 52247.251354 0.444196 0.506250 49.109375 303.935714 26.302232 166.950000 37.525446 ... 5.316518 0.072768 0.074554 0.072768 0.064286 0.013393 0.009375 3.0 11.0 0.149107
std 3246.662198 11.984069 25173.076661 0.538398 0.544538 28.962453 336.597393 39.773434 225.715373 54.628979 ... 2.426645 0.259813 0.262728 0.259813 0.245316 0.114976 0.096391 0.0 0.0 0.356274
min 0.000000 1893.000000 1730.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.0 11.0 0.000000
25% 2828.250000 1959.000000 35303.000000 0.000000 0.000000 24.000000 23.750000 1.000000 16.000000 3.000000 ... 3.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.0 11.0 0.000000
50% 5458.500000 1970.000000 51381.500000 0.000000 0.000000 49.000000 173.500000 8.000000 67.000000 12.000000 ... 6.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.0 11.0 0.000000
75% 8427.750000 1977.000000 68522.000000 1.000000 1.000000 74.000000 504.250000 33.000000 232.000000 50.000000 ... 7.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.0 11.0 0.000000
max 11191.000000 1996.000000 666666.000000 2.000000 2.000000 99.000000 1493.000000 199.000000 1725.000000 259.000000 ... 20.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 3.0 11.0 1.000000

8 rows × 26 columns

In [42]:
# frequency summary (count / unique / top / freq) for the categorical columns
data.describe(include = [object])
Out[42]:
Education Marital_Status Dt_Customer
count 2240 2240 2240
unique 5 8 663
top Graduation Married 31-08-2012
freq 1127 864 12
In [83]:
# verify whether any cell in the frame is missing
# NOTE(review): describe() above reports Income count 2216 of 2240, which
# implies missing values — a False here suggests nulls were handled in a
# since-deleted cell (hidden kernel state); confirm on Restart & Run All.
data.isnull().values.any()
Out[83]:
False

Data Visualization¶

In [46]:
import warnings
warnings.filterwarnings('ignore')

plt.rcParams['figure.figsize'] = (18, 8)

# FIX: sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot(..., kde=True) is the supported replacement.
plt.subplot(1, 2, 1)
sns.set(style = 'whitegrid')
sns.histplot(data['Income'], kde = True)
plt.title('Distribution of Income', fontsize = 20)
plt.xlabel('Range of Income')
plt.ylabel('Count')


plt.subplot(1, 2, 2)
sns.set(style = 'whitegrid')
sns.histplot(data['Recency'], kde = True, color = 'red')
plt.title('Distribution of Recency', fontsize = 20)
plt.xlabel('Recency')
plt.ylabel('Count')
plt.show()
In [67]:
# Education share as a pie chart.
# FIX: the original hard-coded the label list and assumed it matched the
# order of value_counts(); deriving labels from the index guarantees each
# wedge carries its own category name even if the ordering changes.
size = data['Education'].value_counts()
labels = list(size.index)
colors = ['yellow', 'green', 'orange', 'blue', 'red']

plt.rcParams['figure.figsize'] = (9, 9)
plt.pie(size, colors = colors, labels = labels, shadow = True, startangle = 90, autopct = '%.2f%%')
plt.title('Education type', fontsize = 20)
plt.axis('off')
plt.legend()
plt.show()
In [54]:
# checking the distribution of Income (typo "sitribution" fixed)
# FIX: distplot was removed from seaborn; histplot(kde=True) replaces it.
plt.rcParams['figure.figsize'] = (30, 12)
sns.histplot(data['Income'], kde = True, color = 'red')
plt.title('Distribution of Income', fontsize = 20)
plt.show()
In [55]:
# checking the distribution of the amount spent on wines
# FIX: distplot was removed from seaborn; histplot(kde=True) replaces it.
plt.rcParams['figure.figsize'] = (30, 12)
sns.histplot(data['MntWines'], kde = True, color = 'blue')
plt.title('Distribution of MntWines', fontsize = 20)
plt.show()
In [56]:
# Compare the income spread across education levels with a boxen plot.
plt.rcParams['figure.figsize'] = (18, 10)
ax = sns.boxenplot(data = data, x = 'Education', y = 'Income', palette = 'Blues')
ax.set_title('Education vs Income', fontsize = 20)
plt.show()
In [57]:
# Income distribution per education level, shown as violins.
plt.rcParams['figure.figsize'] = (18, 7)
ax = sns.violinplot(data = data, x = 'Education', y = 'Income', palette = 'rainbow')
ax.set_title('Education vs Income', fontsize = 20)
plt.show()
In [59]:
# Individual income observations per education level (strip plot).
plt.rcParams['figure.figsize'] = (18, 7)
ax = sns.stripplot(data = data, x = 'Education', y = 'Income', palette = 'Purples', size = 10)
ax.set_title('Education vs Income', fontsize = 20)
plt.show()
In [60]:
# Relationship between income and year of birth as a line plot.
ax = sns.lineplot(data = data, x = 'Income', y = 'Year_Birth', color = 'blue')
ax.set_title('Income vs Year of birth', fontsize = 20)
plt.show()
In [61]:
# Income against the (categorical) education level as a line plot.
ax = sns.lineplot(data = data, x = 'Income', y = 'Education', color = 'pink')
ax.set_title('Income vs Education', fontsize = 20)
plt.show()

Clustering Analysis¶

In [74]:
# Income and Kidhome are the two features used for the clustering analysis.

import warnings
warnings.filterwarnings('ignore')

# FIX: Income has 24 missing values (describe() reports count 2216 of 2240)
# and KMeans raises on NaN, so drop incomplete rows before extracting the
# array. (The original output showed 2240 rows, which suggests nulls were
# handled in a since-deleted cell — hidden kernel state.)
x = data.loc[:, ['Income', 'Kidhome']].dropna().values

# checking the shape of x
print(x.shape)
(2240, 2)
In [75]:
# Preview the array we are about to cluster on, wrapped back into a
# DataFrame; column 0 -> Income, column 1 -> Kidhome.
x_data = pd.DataFrame(x)
x_data.head()
Out[75]:
0 1
0 58138 0
1 46344 1
2 71613 0
3 26646 1
4 58293 1

KMeans Algorithm¶

In [76]:
# Elbow method: fit K-Means for k = 1..10 and plot the within-cluster
# sum of squares (inertia) against k to pick the optimal cluster count.

from sklearn.cluster import KMeans

cluster_range = range(1, 11)
wcss = [
    KMeans(n_clusters = k, init = 'k-means++', max_iter = 300,
           n_init = 10, random_state = 0).fit(x).inertia_
    for k in cluster_range
]

plt.plot(cluster_range, wcss)
plt.title('The Elbow Method', fontsize = 20)
plt.xlabel('No. of Clusters')
plt.ylabel('wcss')
plt.show()
In [77]:
# Visualize the five customer segments produced by K-Means on (Income, Kidhome).

plt.style.use('fivethirtyeight')

km = KMeans(n_clusters = 5, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
y_means = km.fit_predict(x)

# one scatter layer per cluster, keeping the original colour/label pairing
segments = [('pink', 'general'), ('yellow', 'miser'), ('cyan', 'target'),
            ('magenta', 'careful'), ('orange', 'spendthrift')]
for cluster_id, (colour, label) in enumerate(segments):
    members = x[y_means == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s = 100, c = colour, label = label)
plt.scatter(km.cluster_centers_[:,0], km.cluster_centers_[:, 1], s = 50, c = 'blue' , label = 'centeroid')

plt.style.use('fivethirtyeight')
plt.title('K Means Clustering between Income and Kids in the home', fontsize = 20)
plt.xlabel('Income')
plt.ylabel('Kidhome')
plt.legend()
plt.grid()
plt.show()
In [80]:
# Elbow method repeated for the Income/Kidhome feature matrix.
# FIX: the original comment referred to a non-existent "Spending Score"
# column, and the axis labels ('Kidhome'/'Income') did not describe this
# plot — the x axis is the candidate cluster count and the y axis is the
# within-cluster sum of squares (WCSS).

from sklearn.cluster import KMeans

wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
    kmeans.fit(x)
    wcss.append(kmeans.inertia_)

plt.rcParams['figure.figsize'] = (15, 5)
plt.plot(range(1, 11), wcss)
plt.title('K-Means Clustering(The Elbow Method)', fontsize = 20)
plt.xlabel('No. of Clusters')
plt.ylabel('WCSS')
plt.grid()
plt.show()
In [82]:
# Final 4-cluster segmentation of the customers on (Income, Kidhome).
# FIX: the original title said 'Cluster of Ages', but no age column is
# involved — the clustered features are Income and Kidhome.
kmeans = KMeans(n_clusters = 4, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
ymeans = kmeans.fit_predict(x)

plt.rcParams['figure.figsize'] = (10, 10)
plt.title('Clusters of Income vs Kids at Home', fontsize = 30)

plt.scatter(x[ymeans == 0, 0], x[ymeans == 0, 1], s = 100, c = 'pink')
plt.scatter(x[ymeans == 1, 0], x[ymeans == 1, 1], s = 100, c = 'orange')
plt.scatter(x[ymeans == 2, 0], x[ymeans == 2, 1], s = 100, c = 'lightgreen')
plt.scatter(x[ymeans == 3, 0], x[ymeans == 3, 1], s = 100, c = 'red')
# cluster centroids overlaid in black
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s = 50, c = 'black')

plt.style.use('fivethirtyeight')
plt.xlabel('Income')
plt.ylabel('Kidhome')
plt.grid()
plt.show()
In [ ]: